library(NbClust)
library(factoextra)
library(ClusterR)
library(ggplot2)
library(scatterplot3d)
library(dplyr)
library(stringr)
library(VIM)
library(fpc)
library(plotly)

Loading Datasets

# Read the eight experiment datasets, Data1.csv through Data8.csv,
# into variables data1 ... data8, plus the world-indicators file.
for (i in seq_len(8)) {
  assign(paste0("data", i), read.csv(paste0("Data", i, ".csv")))
}
world_data <- read.csv("World Indicators.csv")

Dataset 1

We check K value recommendation from elbow method and silhouette method

# Reproducible cluster-count diagnostics for Dataset 1 (feature columns 2:4):
# elbow plot (total within-cluster sum of squares) and average silhouette.
set.seed(10)
feats1 <- data1[, 2:4]
fviz_nbclust(feats1, kmeans, method = "wss")

fviz_nbclust(feats1, kmeans, method = "silhouette")

K suggested by elbow and silhouette method = 6

# k = 6 comes from the silhouette recommendation above
k <- 6
# Report the Calinski-Harabasz index for every candidate in [k - 2, k + 2]
# so we can sanity-check the suggested k against its neighbours.
print("K CH Value")
for (i in seq(k - 2, k + 2)) {
  km <- kmeans(data1[, 2:4], i, nstart = 20)
  ch <- round(calinhara(data1[, 2:4], km$cluster), digits = 2)
  print(paste(i, ch))
}
## [1] "4 98.68"
## [1] "5 146.82"
## [1] "6 262.71"
## [1] "7 225.1"
## [1] "8 478.81"

Although the CH value for k = 6 (262.71) is greater than for k = 7 (225.1), k = 7 recovers the true class structure perfectly (see the validation below), so we select k = 7

# Final k-means fit with k = 7; store the cluster assignment on data1.
km <- kmeans(data1[, 2:4], 7, nstart = 20)
data1$kmeans_cluster <- km$cluster
# Compare the k-means partition against the true class labels.
# summary_stats = TRUE (never the reassignable shorthand T) also prints
# purity, entropy, NMI, rand indices, etc. alongside the Jaccard index.
external_validation(true_labels = data1$Class, clusters = data1$kmeans_cluster, method = 'jaccard_index', summary_stats = TRUE)
##  
## ---------------------------------------- 
## purity                         : 1 
## entropy                        : 0 
## normalized mutual information  : 1 
## variation of information       : 0 
## normalized var. of information : 0 
## ---------------------------------------- 
## specificity                    : 1 
## sensitivity                    : 1 
## precision                      : 1 
## recall                         : 1 
## F-measure                      : 1 
## ---------------------------------------- 
## accuracy OR rand-index         : 1 
## adjusted-rand-index            : 1 
## jaccard-index                  : 1 
## fowlkes-mallows-index          : 1 
## mirkin-metric                  : 0 
## ----------------------------------------
## [1] 1

Jaccard index = 1, purity = 1 for our clustering solution

# Pairwise Euclidean distances, computed once and kept in matrix form.
dist_matrix <- as.matrix(dist(data1[, 2:4]))
# Single-linkage agglomerative clustering reusing the precomputed
# distances (the original recomputed dist() a second time here).
hc.single <- hclust(as.dist(dist_matrix), method = 'single')
plot(hc.single)

From dendrogram, we can set number of clusters = 7

# Cut the single-linkage dendrogram into 7 clusters and store the labels.
data1$h_cluster <- cutree(hc.single, 7)
# External validation of the hierarchical partition against the true
# classes; summary_stats = TRUE (not the reassignable T) prints the
# full table of agreement statistics.
external_validation(true_labels = data1$Class, clusters = data1$h_cluster, method = 'jaccard_index', summary_stats = TRUE)
##  
## ---------------------------------------- 
## purity                         : 1 
## entropy                        : 0 
## normalized mutual information  : 1 
## variation of information       : 0 
## normalized var. of information : 0 
## ---------------------------------------- 
## specificity                    : 1 
## sensitivity                    : 1 
## precision                      : 1 
## recall                         : 1 
## F-measure                      : 1 
## ---------------------------------------- 
## accuracy OR rand-index         : 1 
## adjusted-rand-index            : 1 
## jaccard-index                  : 1 
## fowlkes-mallows-index          : 1 
## mirkin-metric                  : 0 
## ----------------------------------------
## [1] 1

Jaccard index = 1 and purity = 1 for our hierarchical clustering

## Plotting data according to actual class, Kmeans clustering and Hierarchical clustering

# 3-D scatter of Dataset 1 coloured by the true class label
fig <- plot_ly(data1, x = ~X1, y = ~X2, z = ~X3, color = ~Class) %>%
  add_markers() %>%
  layout(scene = list(
    xaxis = list(title = "X1"),
    yaxis = list(title = "X2"),
    zaxis = list(title = "X3")
  ))
fig
# 3-D scatter of Dataset 1 coloured by the k-means cluster assignment
fig <- plot_ly(data1, x = ~X1, y = ~X2, z = ~X3, color = ~kmeans_cluster) %>%
  add_markers() %>%
  layout(scene = list(
    xaxis = list(title = "X1"),
    yaxis = list(title = "X2"),
    zaxis = list(title = "X3")
  ))
fig
# 3-D scatter of Dataset 1 coloured by the hierarchical cluster assignment
fig <- plot_ly(data1, x = ~X1, y = ~X2, z = ~X3, color = ~h_cluster) %>%
  add_markers() %>%
  layout(scene = list(
    xaxis = list(title = "X1"),
    yaxis = list(title = "X2"),
    zaxis = list(title = "X3")
  ))
fig